In this competition we were given dotted training images as labels for the sea lions in each image instead of coordinates. Not only are the dotted images of little use until coordinates are extracted from them, but many of the counts reported in train.csv are incorrect.
The goal of this kernel is to illustrate exactly what I did to generate a coordinates file, and a corrected train.csv.
These steps do not need to be performed again: you can simply download the coordinates file here and the count file here, then check my results using the coordinate-marking function in the "Visualizing Coordinates" section.
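For reference, once the two files are downloaded they can be loaded straight into pandas. This is a minimal sketch; the local file names are placeholders for wherever you saved the downloads:
In [ ]:
import pandas as pd

# placeholder local paths -- point these at the downloaded files
coordinates_df = pd.read_csv('correct_coordinates.csv', index_col=0)   # columns: filename, y_coord, x_coord, category
counts_df = pd.read_csv('correct_train.csv', index_col='train_id')     # per-image counts for each category

print(coordinates_df.head())
print(counts_df.head())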
What I will cover:
**You can set up your folders however you want, but to be compatible with the code below, I will illustrate my folder structure.**
I have 3 folders: Train (the original training images), TrainDotted (the dotted training images), and output (where all generated files are saved); these correspond to the path constants defined below.
Radu's Method was implemented here to extract the coordinates
In [ ]:
# imports
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import skimage.feature
from tqdm import tqdm # nice progress bars
import ast # parses the coordinate lists stored as strings in changes.csv (used later)
%matplotlib inline
# constants
TRAIN_PATH = '../data/Train/'
DOTTED_PATH = '../data/TrainDotted/'
OUT_PATH = '../output/'
ALL_FILE_NAMES = os.listdir(DOTTED_PATH) # all our training file names
ALL_FILE_NAMES = sorted(ALL_FILE_NAMES, key = lambda item: int(item.partition('.')[0]))
Because some images in the training set are mismatched and will not work for this method (see datacanary's post), I removed those images from the list of training files.
In [ ]:
MISMATCHED_TRAIN = [3, 7, 9, 21, 30, 34, 71, 81, 89, 97, 151, 184, 215, 234, 242, 268, 290, 311, 331, 344, 380, 384, 406, 421, 469, 475, 490, 499, 507, 530, 531, 605, 607, 614, 621, 638, 644, 687, 712, 721, 767, 779, 781, 794, 800, 811, 839, 840, 869, 882, 901, 903, 905, 909, 913, 927, 946]
FILE_NAMES = []
for filename in ALL_FILE_NAMES:
    if int(filename.partition('.')[0]) not in MISMATCHED_TRAIN:
        FILE_NAMES.append(filename)  # keep only the images that are not mismatched
Now I created two pandas DataFrames that will later be saved to csv files. One of them (count_df) records the number of sea lions of each category that the method was able to find in each image.
In [ ]:
count_df = pd.DataFrame(index = FILE_NAMES, columns = ["adult_males", "subadult_males", "adult_females", "juveniles", "pups"]).fillna(0)
The other DataFrame (coordinates_df) records the coordinates of every discovered sea lion (y_coord, x_coord), the image the sea lion came from (filename), and the category of the sea lion.
In [ ]:
coordinates_df = pd.DataFrame(columns = ["filename", "y_coord", "x_coord", "category"]).fillna(0)
Then I iterated over all the training files, extracted the y_coord, x_coord, and category of every dot according to Radu's Method, and saved the count_df DataFrame to initial_count.csv and the coordinates_df DataFrame to initial_coordinates.csv.
In [ ]:
for filename in tqdm(FILE_NAMES):
    img_dotted = cv2.imread(DOTTED_PATH + filename)
    img_train = cv2.imread(TRAIN_PATH + filename)
    img_diff = cv2.absdiff(img_train, img_dotted)

    # mask out the blackened regions present in either image
    mask_1 = cv2.cvtColor(img_dotted, cv2.COLOR_BGR2GRAY)
    mask_1[mask_1 < 20] = 0
    mask_1[mask_1 > 0] = 255
    mask_2 = cv2.cvtColor(img_train, cv2.COLOR_BGR2GRAY)
    mask_2[mask_2 < 20] = 0
    mask_2[mask_2 > 0] = 255
    img_diff = cv2.bitwise_or(img_diff, img_diff, mask=mask_1)
    img_diff = cv2.bitwise_or(img_diff, img_diff, mask=mask_2)
    img_diff = cv2.cvtColor(img_diff, cv2.COLOR_BGR2GRAY)

    # the dots show up as small blobs in the difference image
    blobs = skimage.feature.blob_log(img_diff, min_sigma=3, max_sigma=4, num_sigma=1, threshold=0.02)

    for blob in blobs:
        y, x, s = blob
        # classify the dot by its colour in the dotted image (OpenCV loads images as BGR)
        b, g, r = img_dotted[int(y)][int(x)][:]
        if r > 204 and g < 29 and b < 26:              # RED
            category = "adult_males"
        elif r > 220 and g < 25 and b > 204:           # MAGENTA
            category = "subadult_males"
        elif 6 < r < 64 and 156 < g < 199 and b < 52:  # GREEN
            category = "pups"
        elif r < 78 and 31 < g < 85 and 124 < b < 221: # BLUE
            category = "juveniles"
        elif 59 < r < 115 and 19 < g < 80 and b < 49:  # BROWN
            category = "adult_females"
        else:
            continue  # not a recognized dot colour
        count_df.loc[filename, category] += 1
        new_row = pd.Series([filename, int(y), int(x), category],
                            index=["filename", "y_coord", "x_coord", "category"])
        coordinates_df = coordinates_df.append(new_row, ignore_index=True)

count_df.to_csv(OUT_PATH + 'initial_count.csv')
coordinates_df.to_csv(OUT_PATH + 'initial_coordinates.csv')
Here I compared the number of sea lions counted by Radu's Method to the "true" counts given in train.csv. The function below takes the count file (initial_count.csv) and compares it to train.csv, then prints, for every image with a discrepancy, the affected categories along with the count predicted by Radu's Method and the "true" value from train.csv.
In [ ]:
def report_error(count_file):
    # check that the generated count file matches the "true" sea lion numbers in train.csv
    count_df = pd.read_csv(OUT_PATH + count_file, index_col=0)
    true_count_df = pd.read_csv(TRAIN_PATH + 'train.csv')
    categories = ["adult_males", "subadult_males", "adult_females", "juveniles", "pups"]
    wrong_files_dict = {}
    for filename, row in count_df.iterrows():
        train_id = int(filename.partition('.')[0])
        wrong_list = []
        for category in categories:
            predicted_val = int(row[category])
            true_val = int(true_count_df[category][train_id])
            if predicted_val != true_val:
                wrong_list.append([category, predicted_val, true_val])
        if len(wrong_list) != 0:
            wrong_files_dict[train_id] = wrong_list
    # print the discrepancies, ordered by image id
    for img_id in sorted(wrong_files_dict.keys(), key=int):
        print(str(img_id) + '.jpg')
        for category, predicted_val, true_val in wrong_files_dict[img_id]:
            print(' ' + category + ': predicted=' + str(predicted_val) + ', true=' + str(true_val))
In [ ]:
report_error('initial_count.csv')
I noticed that many images had discrepancies between the generated counts and the train.csv counts. The only way to determine whether the generated counts or the train.csv counts were correct was to manually check the coordinates for the images with discrepancies.
Note: images without discrepancies between the generated counts and the train.csv counts were not manually checked, and are assumed to have accurate coordinates and counts. Other images may be manually checked later; check the bottom of this kernel for "Edits" to the data files.
In order to check the validity of the coordinates, they must be visualized on the image.
Radu's Method was implemented here to visualize the extracted coordinates on the dotted training images
Using the function defined below (graph_coord_circles), all we need to do is pass it a list of file names and the name of the coordinates file generated earlier (initial_coordinates.csv). The function draws a circle around every coordinate for each file and saves a new jpg, with the circles drawn on the image, to the output folder. This lets us check whether the coordinates are centered correctly on the sea lions.
In [ ]:
def graph_coord_circles(FILE_NAMES, coord_file):
    # BGR circle colour for each sea lion category
    circle_colors = {'adult_males': (0, 0, 255),
                     'subadult_males': (250, 10, 250),
                     'pups': (20, 180, 35),
                     'juveniles': (180, 60, 30),
                     'adult_females': (0, 42, 84)}
    coordinates_df = pd.read_csv(OUT_PATH + coord_file)
    for filename in FILE_NAMES:
        new_df = coordinates_df.loc[coordinates_df['filename'] == filename]
        dotted_img = cv2.imread(DOTTED_PATH + filename)
        for index, row in new_df.iterrows():
            color = circle_colors.get(row['category'])
            if color is None:
                continue  # unknown category, skip it
            cv2.circle(dotted_img, (int(row['x_coord']), int(row['y_coord'])), 8, color, 2)
        cv2.imwrite(OUT_PATH + str(filename.partition('.')[0]) + '_marked.jpg', dotted_img)
In [ ]:
# uncomment the line below and run this cell to generate marked images for all the training files
# graph_coord_circles(FILE_NAMES, 'initial_coordinates.csv')
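To spot-check just a handful of images rather than regenerating marked images for the whole training set, you can pass a slice of FILE_NAMES instead:
In [ ]:
# mark only the first few (non-mismatched) training images as a quick spot check
graph_coord_circles(FILE_NAMES[:5], 'initial_coordinates.csv')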
Using the images with circles drawn on the coordinates, I checked the images with discrepancies for a couple of things:
Based on the conditions above, the data (coordinates and counts) for a specific image may be incorrect. I take initial_coordinates.csv as the starting point and correct it by adding coordinates for sea lions the method missed and removing coordinates that do not actually mark a sea lion.
In [ ]:
# first load in the data from initial_coordinates.csv
correct_coordinates_df = pd.read_csv(OUT_PATH + 'initial_coordinates.csv', index_col=0)
My changes to the images are recorded in the changes.csv file located in this repository. For every image, changes.csv lists the coordinates to add (the coord_add column) and the coordinates to remove (the coord_remove column). To apply the changes, coordinates must be added to and removed from initial_coordinates.csv, and the sea lion counts for each image recalculated.
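To make the expected format concrete, here is a made-up example of how one row's coord_add and coord_remove entries are parsed: each entry is a string-encoded list of [y_coord, x_coord, category] lists (possibly empty), which ast.literal_eval turns back into Python objects.
In [ ]:
import ast

# made-up example values for one row's coord_add / coord_remove columns
example_coord_add = "[[512, 1034, 'pups'], [88, 1410, 'adult_females']]"
example_coord_remove = "[[]]"  # an empty inner list means there is nothing to remove

coord_add_list = ast.literal_eval(example_coord_add)
coord_remove_list = ast.literal_eval(example_coord_remove)
print(coord_add_list[0])          # [512, 1034, 'pups'] -> y_coord, x_coord, category
print(len(coord_remove_list[0]))  # 0, so the removal loop below skips it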
In [ ]:
# getting list of good image ids
IMG_IDS = []
for filename in FILE_NAMES:
    IMG_IDS.append(int(filename.partition('.')[0]))

# function to apply the changes and get correct coordinates and counts
def apply_all_changes():
    changes_df = pd.read_csv('./changes.csv', index_col='img_id')
    # start from the coordinates extracted earlier
    new_coord_df = correct_coordinates_df.copy()
    # getting all image ids listed in changes.csv
    img_ids = list(changes_df.index)
    for img_id in img_ids:
        filename = str(img_id) + '.jpg'
        mini_changes_df = changes_df.loc[int(img_id)]  # only 1 row per image
        coord_add_list = ast.literal_eval(mini_changes_df['coord_add'])
        coord_remove_list = ast.literal_eval(mini_changes_df['coord_remove'])
        # add coordinates for sea lions the extraction missed
        for coord_add in coord_add_list:
            if len(coord_add) == 0:
                continue
            y_coord = int(coord_add[0])
            x_coord = int(coord_add[1])
            category = coord_add[2]
            new_row = pd.Series([filename, y_coord, x_coord, category],
                                index=["filename", "y_coord", "x_coord", "category"])
            new_coord_df = new_coord_df.append(new_row, ignore_index=True)
        # remove coordinates that do not mark a sea lion
        for coord_remove in coord_remove_list:
            if len(coord_remove) == 0:
                continue
            y_coord = coord_remove[0]
            x_coord = coord_remove[1]
            category = coord_remove[2]
            mask = ((new_coord_df['filename'] == filename) &
                    (new_coord_df['y_coord'] == y_coord) &
                    (new_coord_df['x_coord'] == x_coord) &
                    (new_coord_df['category'] == category))
            new_coord_df = new_coord_df[~mask]
    new_coord_df.to_csv(OUT_PATH + 'correct_coordinates.csv')  # save the correct coordinates
    # next create a new file with the correct counts of sea lions
    new_counts_df = pd.DataFrame(index=IMG_IDS,
                                 columns=["adult_males", "subadult_males", "adult_females", "juveniles", "pups"]).fillna(0)
    for _, row in new_coord_df.iterrows():
        file_id = int(row['filename'].partition('.')[0])
        new_counts_df.loc[file_id, row['category']] += 1
    new_counts_df.to_csv(OUT_PATH + 'correct_train.csv', index_label='train_id')

apply_all_changes()
Now the final results have been generated: the correct coordinates and counts are located in correct_coordinates.csv and correct_train.csv respectively. If you run this entire notebook up to this point, you should be able to generate the exact same coordinates file from here and counts file from here. Note that the bad images (those in MISMATCHED_TRAIN) have no coordinates in correct_coordinates.csv.
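As a quick sanity check (a sketch, assuming the notebook above has been run so OUT_PATH and MISMATCHED_TRAIN are defined), you can confirm that none of the mismatched image ids appear in the corrected files:
In [ ]:
# sanity check: the mismatched images should not appear in the corrected files
correct_coords = pd.read_csv(OUT_PATH + 'correct_coordinates.csv', index_col=0)
correct_counts = pd.read_csv(OUT_PATH + 'correct_train.csv', index_col='train_id')

coord_ids = set(int(f.partition('.')[0]) for f in correct_coords['filename'])
assert coord_ids.isdisjoint(MISMATCHED_TRAIN)
assert set(correct_counts.index).isdisjoint(MISMATCHED_TRAIN)
print('images with coordinates:', len(coord_ids))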